PM 536 Lab 11

Author

Erin Ross

library(plotly)
Loading required package: ggplot2

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(ggplot2)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.2     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.0
✔ readr     2.1.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks plotly::filter(), stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(zoo)

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

1. Read in the data

## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data

## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data

# load COVID state-level data from NYT
cv_states <- read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")
str(cv_states)
'data.frame':   61942 obs. of  5 variables:
 $ date  : chr  "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" ...
 $ state : chr  "Washington" "Washington" "Washington" "Illinois" ...
 $ fips  : int  53 53 53 17 53 6 17 53 4 6 ...
 $ cases : int  1 1 1 1 1 1 1 1 1 2 ...
 $ deaths: int  0 0 0 0 0 0 0 0 0 0 ...
table(cv_states$state)

                 Alabama                   Alaska           American Samoa 
                    1106                     1107                      548 
                 Arizona                 Arkansas               California 
                    1153                     1108                     1154 
                Colorado              Connecticut                 Delaware 
                    1114                     1111                     1108 
    District of Columbia                  Florida                  Georgia 
                    1112                     1118                     1117 
                    Guam                   Hawaii                    Idaho 
                    1104                     1113                     1106 
                Illinois                  Indiana                     Iowa 
                    1155                     1113                     1111 
                  Kansas                 Kentucky                Louisiana 
                    1112                     1113                     1110 
                   Maine                 Maryland            Massachusetts 
                    1107                     1114                     1147 
                Michigan                Minnesota              Mississippi 
                    1109                     1113                     1108 
                Missouri                  Montana                 Nebraska 
                    1112                     1106                     1131 
                  Nevada            New Hampshire               New Jersey 
                    1114                     1117                     1115 
              New Mexico                 New York           North Carolina 
                    1108                     1118                     1116 
            North Dakota Northern Mariana Islands                     Ohio 
                    1108                     1091                     1110 
                Oklahoma                   Oregon             Pennsylvania 
                    1113                     1120                     1113 
             Puerto Rico             Rhode Island           South Carolina 
                    1106                     1118                     1113 
            South Dakota                Tennessee                    Texas 
                    1109                     1114                     1136 
                    Utah                  Vermont           Virgin Islands 
                    1123                     1112                     1105 
                Virginia               Washington            West Virginia 
                    1112                     1158                     1102 
               Wisconsin                  Wyoming 
                    1143                     1108 
#Non-states = Virgin Islands, Puerto Rico, Northern Mariana Islands, DC, Guam, American Samoa

# load state population data
state_pops <- read.csv("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv")
str(state_pops)
'data.frame':   52 obs. of  5 variables:
 $ state      : chr  "AL" "AK" "AZ" "AR" ...
 $ state_name : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ geo_id     : int  1 2 4 5 6 8 9 10 11 12 ...
 $ population : int  4887871 737438 7171646 3013825 39557045 5695564 3572665 967171 702455 21299325 ...
 $ pop_density: num  96.51 1.29 63.14 57.92 253.91 ...
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL

table(state_pops$state)

             Alabama               Alaska              Arizona 
                   1                    1                    1 
            Arkansas           California             Colorado 
                   1                    1                    1 
         Connecticut             Delaware District of Columbia 
                   1                    1                    1 
             Florida              Georgia               Hawaii 
                   1                    1                    1 
               Idaho             Illinois              Indiana 
                   1                    1                    1 
                Iowa               Kansas             Kentucky 
                   1                    1                    1 
           Louisiana                Maine             Maryland 
                   1                    1                    1 
       Massachusetts             Michigan            Minnesota 
                   1                    1                    1 
         Mississippi             Missouri              Montana 
                   1                    1                    1 
            Nebraska               Nevada        New Hampshire 
                   1                    1                    1 
          New Jersey           New Mexico             New York 
                   1                    1                    1 
      North Carolina         North Dakota                 Ohio 
                   1                    1                    1 
            Oklahoma               Oregon         Pennsylvania 
                   1                    1                    1 
         Puerto Rico         Rhode Island       South Carolina 
                   1                    1                    1 
        South Dakota            Tennessee                Texas 
                   1                    1                    1 
                Utah              Vermont             Virginia 
                   1                    1                    1 
          Washington        West Virginia            Wisconsin 
                   1                    1                    1 
             Wyoming 
                   1 
#Non-states = DC, Puerto Rico

cv_states <- merge(cv_states, state_pops, by="state")

2. Look at the data

Inspect the dimensions, head, and tail of the data
Inspect the structure of each variables. Are they in the correct format?

We had two datasets with 5 variables each. Once we merged them, we have 9 variables, which is expected. We lost observations from US Virgin Islands, Northern Mariana Islands, Guam, and American Samoa when we merged (~10000).

dim(cv_states)
[1] 58094     9
head(cv_states)
    state       date fips   cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04    1 1587224  21263      1    4887871    96.50939  AL
2 Alabama 2020-04-25    1    6213    213      1    4887871    96.50939  AL
3 Alabama 2023-02-26    1 1638348  21400      1    4887871    96.50939  AL
4 Alabama 2022-12-03    1 1549285  21129      1    4887871    96.50939  AL
5 Alabama 2020-05-06    1    8691    343      1    4887871    96.50939  AL
6 Alabama 2021-04-21    1  524367  10807      1    4887871    96.50939  AL
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
58089 Wyoming 2022-09-11   56 175290   1884     56     577737    5.950611  WY
58090 Wyoming 2022-08-21   56 173487   1871     56     577737    5.950611  WY
58091 Wyoming 2021-01-26   56  51152    596     56     577737    5.950611  WY
58092 Wyoming 2021-02-21   56  53795    662     56     577737    5.950611  WY
58093 Wyoming 2021-08-22   56  70671    809     56     577737    5.950611  WY
58094 Wyoming 2021-03-20   56  55581    693     56     577737    5.950611  WY
str(cv_states)
'data.frame':   58094 obs. of  9 variables:
 $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
 $ date       : chr  "2023-01-04" "2020-04-25" "2023-02-26" "2022-12-03" ...
 $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ cases      : int  1587224 6213 1638348 1549285 8691 524367 1321892 1088370 1153149 814025 ...
 $ deaths     : int  21263 213 21400 21129 343 10807 19676 16756 16826 15179 ...
 $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : chr  "AL" "AL" "AL" "AL" ...

3. Format the data

Make date into a date variable
Make state into a factor variable
Order the data first by state, second by date
Confirm the variables are now correctly formatted
Inspect the range values for each variable. What is the date range? The range of cases and deaths?

The date range is January 21, 2020 to March 23, 2023. The range of cases is 1-12 Million (!), and the range of deaths is 0-104,277. There are some missing values for population density.

# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")

# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)

# order the data first by state, second by date
cv_states <- cv_states %>% arrange(state, date)

# Confirm the variables are now correctly formatted
str(cv_states)
'data.frame':   58094 obs. of  9 variables:
 $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ date       : Date, format: "2020-03-13" "2020-03-14" ...
 $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
 $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
    state       date fips cases deaths geo_id population pop_density abb
1 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
2 Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
3 Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
4 Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
5 Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
6 Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
58089 Wyoming 2023-03-18   56 185640   2009     56     577737    5.950611  WY
58090 Wyoming 2023-03-19   56 185640   2009     56     577737    5.950611  WY
58091 Wyoming 2023-03-20   56 185640   2009     56     577737    5.950611  WY
58092 Wyoming 2023-03-21   56 185800   2014     56     577737    5.950611  WY
58093 Wyoming 2023-03-22   56 185800   2014     56     577737    5.950611  WY
58094 Wyoming 2023-03-23   56 185800   2014     56     577737    5.950611  WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
    state       date fips cases deaths geo_id population pop_density abb
1 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
2 Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
3 Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
4 Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
5 Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
6 Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
summary(cv_states)
           state            date                 fips           cases         
 Washington   : 1158   Min.   :2020-01-21   Min.   : 1.00   Min.   :       1  
 Illinois     : 1155   1st Qu.:2020-12-06   1st Qu.:16.00   1st Qu.:  112125  
 California   : 1154   Median :2021-09-11   Median :29.00   Median :  418120  
 Arizona      : 1153   Mean   :2021-09-10   Mean   :29.78   Mean   :  947941  
 Massachusetts: 1147   3rd Qu.:2022-06-17   3rd Qu.:44.00   3rd Qu.: 1134318  
 Wisconsin    : 1143   Max.   :2023-03-23   Max.   :72.00   Max.   :12169158  
 (Other)      :51184                                                          
     deaths           geo_id        population        pop_density       
 Min.   :     0   Min.   : 1.00   Min.   :  577737   Min.   :    1.292  
 1st Qu.:  1598   1st Qu.:16.00   1st Qu.: 1805832   1st Qu.:   43.659  
 Median :  5901   Median :29.00   Median : 4468402   Median :  107.860  
 Mean   : 12553   Mean   :29.78   Mean   : 6397965   Mean   :  423.031  
 3rd Qu.: 15952   3rd Qu.:44.00   3rd Qu.: 7535591   3rd Qu.:  229.511  
 Max.   :104277   Max.   :72.00   Max.   :39557045   Max.   :11490.120  
                                                     NA's   :1106       
      abb       
 WA     : 1158  
 IL     : 1155  
 CA     : 1154  
 AZ     : 1153  
 MA     : 1147  
 WI     : 1143  
 (Other):51184  
min(cv_states$date)
[1] "2020-01-21"
max(cv_states$date)
[1] "2023-03-23"

4. Add new_cases and new_deaths and correct outliers

Add variables for new cases, new_cases, and new deaths, new_deaths: (Hint: You can set new_cases equal to the difference between cases on date i and date i-1, starting on date i=2). Filter to dates after June 1, 2021. Use plotly for EDA: See if there are outliers or values that don't make sense for new_cases and new_deaths. Which states and which dates have strange values?

In California there were huge spikes in new cases on 9/3/21, 9/16/21, 5/15/21, 7/4/22, 11/11/22, 1/22/223.

In Florida, there were spikes in new cases on 7/29/22 and 2/18/23.

In Texas, there was a spike in new cases on 11/29/22.

There are some negative new case and new deaths in multiple states on multiple dates, the most startling number being -3770 new deaths in MA on 10/08/22, and -598 deaths in Ohio on 12/16/22.

Correct outliers: Set negative values for new_cases or new_deaths to 0. Recalculate cases and deaths as cumulative sum of updated new_cases and new_deaths.Get the rolling average of new cases and new deaths to smooth over time. Inspect data again interactively.

New deaths make a lot more sense now – no negative values. Still see spikes in deaths in various states that may represent batched reporting of deaths?

# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  cv_subset = cv_subset[order(cv_subset$date),]

  # add starting level for new cases and deaths
  cv_subset$new_cases = cv_subset$cases[1]
  cv_subset$new_deaths = cv_subset$deaths[1]

  ##FINISH
  for (j in 2:nrow(cv_subset)) {
    cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
    cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
  }

  # include in main dataset
  cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
  cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}

# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2021-06-01")

### FINISH
# Inspect outliers in new_cases using plotly
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace

p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace

# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0

# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])

  # add starting level for new cases and deaths
  cv_subset$cases = cv_subset$cases[1]
  cv_subset$deaths = cv_subset$deaths[1]

  ### FINISH
  for (j in 2:nrow(cv_subset)) {
    cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$new_cases[j-1]
    cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$new_deaths[j-1]
  }
  # include in main dataset
  cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
  cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}

# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)

# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2=NULL

5. Add additional variables

Add population-normalized (by 100,000) variables for each variable type (rounded to 1 decimal place). Make sure the variables you calculate are in the correct format (numeric). You can use the following variable names:
  • per100k = cases per 100,000 population

  • newper100k= new cases per 100,000

  • deathsper100k = deaths per 100,000

  • newdeathsper100k = new deaths per 100,000

Add a "naive CFR" variable representing deaths / cases on each date for each state. Create a dataframe representing values on the most recent date, cv_states_today, as done in lecture.
### FINISH
# add population normalized (by 100,000) counts for each variable
cv_states$per100k =  as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k =  as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
Warning: NAs introduced by coercion
cv_states$deathsper100k =  as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k =  as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
Warning: NAs introduced by coercion
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))

# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date==max(cv_states$date))

6. Explore scatterplots using plot_ly()

Create a scatterplot using plot_ly() representing pop_density vs. various variables (e.g. cases, per100k, deaths, deathsper100k) for each state on most recent date (cv_states_today)
Color points by state and size points by state population. Use hover to identify any outliers. Remove those outliers and replot. Choose one plot. Add hoverinfo specifying the state name, cases per 100k, and deaths per 100k, similarly to how we did this in the lecture notes. Add layout information to title the chart and the axes. Enable hovermode = "compare"
### FINISH CODE HERE
# pop_density vs. deaths
cv_states_today %>% 
  plot_ly(x = ~pop_density, y = ~deaths, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 1 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")

# pop_density vs. cases after filtering
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deaths, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 1 observations

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# pop_density vs. deathsper100k
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 1 observations

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# Adding hoverinfo
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
          hoverinfo = 'text',
          text = ~paste( paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep="") , 
                         paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
  layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
                  yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
         hovermode = "compare")
Warning: Ignoring 1 observations

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

7. Explore scatterplot trend interactively using ggplotly() and geom_smooth()

For pop_density vs. newdeathsper100k create a chart with the same variables using gglot_ly().Explore the pattern between \(x\) and \(y\) using geom_smooth().Explain what you see. Do you think pop_density is a correlate of newdeathsper100k?

There does seem to be a relationship of state-level population density with deaths, though this is really only apparent at higher population densities. At low densities there is high variability in death rates (probably politically influenced). County-level population density vs death rates may have more interesting results.

### FINISH 
p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) + geom_point(aes(color=state)) + geom_smooth() +
  labs(title = "Deaths per 100,000 People by State Population Density",
       x = "Population Density",
       y = "Deaths per 100,000 People") +
  scale_color_discrete(name = "State, Size by\nTotal Population")
ggplotly(p)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: The following aesthetics were dropped during statistical transformation: size
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

8. Multiple line chart

Create a line chart of the naive_CFR for all states over time using plot_ly().Use the zoom and pan tools to inspect the naive_CFR for the states that had an increase in September. How have they changed over time?

In most states, CFR was stable over time. Exceptions include an increase in CFR for Kansas in April 2022, a peak in Texas on Sept 11 2022, and a very sharp and likely erroneous CFR of 118k in California in Nov 2022.

Create one more line chart, for Florida only, which shows new_cases and new_deaths together in one plot. Hint: use add_layer()Use hoverinfo to "eyeball" the approximate peak of deaths and peak of cases. What is the time delay between the peak of cases and the peak of deaths?

The peaks in deaths are so small it is hard to see the true peak cases on the screen at the same time as the peaks in deaths, but the largest peak in cases is on Jul 29 2022 with 193k cases, and the next peak in deaths comes on August 11 2022 with 649 (this date was also a peak case-reporting date). This reporting cycle with such high peaks and troughs on a daily basis and concurrent peaks in deaths/cases indicates there may be batch reporting of both cases and deaths occurring. However, the two week lag from peak cases to next peak of deaths (not same day) fits with our knowledge of the natural history of COVID pneumonia.

### FINISH
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
### FINISH
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>% 
  filter(state=="Florida") %>% 
  arrange(date) %>%
  plot_ly(opacity = 0.5) %>%
    add_lines(x = ~date, y = ~new_cases, type = "scatter", mode = "line", color = I("blue"), name = "New Cases") %>%
  add_lines(x = ~date, y = ~new_deaths, type = "scatter", mode = "line", color = I("red"), name = "New Deaths")

9. Heatmaps

Create a heatmap to visualize new_cases for each state on each date greater than June 1st, 2021. Start by mapping selected features in the dataframe into a matrix using the tidyr package function pivot_wider(), naming the rows and columns, as done in the lecture notes.
Use plot_ly() to create a heatmap out of this matrix. Which states stand out?

My matrix looks correct but the plot is only showing the new case rate for a three month window, no matter what I set the filtering date as. So I’ve truncated it down to the window that shows up – and the dates when I hover are October 2, 2022 for all states?? Florida has a stand out total case rate on this date.

Repeat with newper100k variable. Now which states stand out?

Same problem as above – but states with higher cases per 100,000 people include Connecticut, Florida, and Montana on Oct 2, 2022.

Create a second heatmap in which the pattern of new_cases for each state over time becomes more clear by filtering to only look at dates every two weeks.

This time we at least got a few different dates plotted. There is a stand-out point in Tennessee in December 2021…though the square is over July 2022 :(.

### FINISH
# Map state, date, and new_cases to a matrix
cv_states_mat <- cv_states %>% 
  select(state, date, new_cases) %>% 
  filter(date>="2021-06-01")
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, 
                                            names_from = state, 
                                            values_from = new_cases))
str(cv_states_mat2)
'data.frame':   661 obs. of  53 variables:
 $ date                : Date, format: "2021-06-01" "2021-06-02" ...
 $ Alabama             : num  NA NA NA NA NA NA 417 356 321 301 ...
 $ Alaska              : num  296 301 309 317 317 37 44 29 33 25 ...
 $ Arizona             : num  71 202 248 297 266 339 392 451 382 389 ...
 $ Arkansas            : num  389 424 452 488 517 150 159 182 176 190 ...
 $ California          : num  186 318 473 641 760 ...
 $ Colorado            : num  2358 2434 2553 2486 2497 ...
 $ Connecticut         : num  338 354 313 241 195 130 107 84 83 82 ...
 $ Delaware            : num  109 112 87 88 66 55 43 42 45 43 ...
 $ District of Columbia: num  62 64 68 72 72 24 30 17 17 16 ...
 $ Florida             : num  865 1042 1310 1310 1310 ...
 $ Georgia             : num  34 102 171 240 289 327 366 406 410 404 ...
 $ Hawaii              : num  406 409 415 424 433 46 51 47 50 53 ...
 $ Idaho               : num  114 139 161 185 185 112 146 122 118 110 ...
 $ Illinois            : num  162 229 330 419 493 443 480 473 464 421 ...
 $ Indiana             : num  1293 1330 1504 1560 1620 ...
 $ Iowa                : num  7 30 48 63 76 87 91 97 88 82 ...
 $ Kansas              : num  240 301 301 350 350 110 141 144 124 125 ...
 $ Kentucky            : num  241 282 326 383 417 236 250 270 272 264 ...
 $ Louisiana           : num  446 537 582 328 328 328 442 317 336 358 ...
 $ Maine               : num  606 614 629 640 653 65 69 73 72 66 ...
 $ Maryland            : num  142 151 171 191 83 98 110 106 115 114 ...
 $ Massachusetts       : num  270 289 326 309 245 221 179 169 171 151 ...
 $ Michigan            : num  502 576 652 732 788 790 492 405 372 350 ...
 $ Minnesota           : num  872 891 924 960 149 186 214 213 214 203 ...
 $ Mississippi         : num  379 394 421 441 441 441 115 118 118 119 ...
 $ Missouri            : num  173 245 340 430 488 337 404 478 496 497 ...
 $ Montana             : num  374 397 415 411 106 99 80 97 84 80 ...
 $ Nebraska            : num  24 28 35 33 33 34 46 35 35 32 ...
 $ Nevada              : num  289 349 399 263 254 243 324 301 272 264 ...
 $ New Hampshire       : num  165 168 175 180 180 180 31 25 29 29 ...
 $ New Jersey          : num  109 148 188 179 228 258 254 260 247 256 ...
 $ New Mexico          : num  486 496 511 349 266 185 124 85 86 84 ...
 $ New York            : num  230 288 357 410 503 535 570 567 570 579 ...
 $ North Carolina      : num  1130 1077 1069 1061 822 ...
 $ North Dakota        : num  633 645 652 660 664 36 39 42 36 36 ...
 $ Ohio                : num  89 137 207 295 349 388 424 396 404 383 ...
 $ Oklahoma            : num  1012 1018 1035 1057 1057 ...
 $ Oregon              : num  311 365 395 463 502 533 264 283 277 290 ...
 $ Pennsylvania        : num  387 464 571 658 713 494 562 503 479 468 ...
 $ Puerto Rico         : num  752 754 758 769 780 45 54 54 53 56 ...
 $ Rhode Island        : num  285 236 200 182 173 112 47 37 37 36 ...
 $ South Carolina      : num  75 112 127 141 173 199 138 155 150 147 ...
 $ South Dakota        : num  284 289 292 294 10 10 12 16 12 13 ...
 $ Tennessee           : num  168 215 255 290 290 199 246 195 175 161 ...
 $ Texas               : num  585 921 1364 1598 1783 ...
 $ Utah                : num  1374 1402 1451 1492 1536 ...
 $ Vermont             : num  153 154 155 157 157 157 9 10 10 11 ...
 $ Virginia            : num  44 70 107 144 185 207 185 202 204 198 ...
 $ Washington          : num  519 627 750 725 760 709 752 664 641 626 ...
 $ West Virginia       : num  618 634 655 672 672 135 179 111 108 105 ...
 $ Wisconsin           : num  156 193 221 252 252 105 147 167 149 147 ...
 $ Wyoming             : num  527 537 552 564 305 217 90 69 73 67 ...
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=cv_states_mat2,
             type="heatmap",
             showscale=T) %>%
  layout(yaxis = list(range=c("2022-12-01","2023-04-30")))
# Repeat with newper100k
cv_states_mat <- cv_states %>% 
  select(state, date, newper100k) %>% 
  dplyr::filter(date>"2021-06-01")
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=~cv_states_mat2,
             type="heatmap",
             showscale=T) %>%
  layout(yaxis = list(range=c("2022-12-01","2023-04-30")))
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2023-11-01"), by=14)

cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=~cv_states_mat2,
             type="heatmap",
             showscale=T)

10. Map

Create a map to visualize the naive_CFR by state on October 15, 2021. Compare with a map visualizing the naive_CFR by state on most recent date. Plot the two maps together using subplot(). Make sure the shading is for the same range of values (google is your friend for this)
Describe the difference in the pattern of the CFR.

The case-fatality rate on Nov 10, 2021, we see peak fatalities were in Rhode Island (92.5) and Illinois (71.4). Interestingly, the case-fatality rates today are higher, with peaks in Virginia (102.8) Florida (79.8), Tennessee (82.2), Hawaii (81.2), and Illinois (83.5). The distribution of CFR has higher death rates in the East/South for both days, but there are actually more states with high CFR today than during “peak” pandemic times.

### For specified date
pick.date = "2021-11-10"

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>% filter(date==pick.date) %>% select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))

# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Make sure both maps are on the same color scale
shadeLimit <- 125

# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  )
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", pick.date), limits = c(0,shadeLimit))
fig <- fig %>% layout(
    title = paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),
    geo = set_map_details
  )
fig_pick.date <- fig

#############
### Map for today's date

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states_today %>%  select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))

# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  )
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", Sys.Date()), limits = c(0,shadeLimit))
fig <- fig %>% layout(
    title = paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),
    geo = set_map_details
  )
fig_Today <- fig

fig_Today
### Plot together 
subplot(fig_pick.date, fig_Today, nrows = 2, margin = .05)